Description of the simulations

Here we simulate from the true model, with the following parameters.

  • 2 unknown factors in W
  • V intercept
  • X intercept
  • Genewise dispersion

For each simulated data, we fit a ZINB model with all the possible combinations (16) of the following parameters:

  • Dimension of W: 1 through 4
  • V intercept: yes, no
  • Dispersion: common, genewise

Estimates

See bias, variance, and MSE in separate Rmd files (simAllen_estimates_25.Rmd, simAllen_estimates_5.Rmd, and simAllen_estimates_75.Rmd).

Impact on dimensionality reduction

par(mar=c(10.1,4.1,4.1,2.1))
ds = 'Allen'
for (nc in c(100, 1000)){
  for (offs in c(-3.5, 0, 3.5)){
    for (aa in c(1, .85)){
      pref = sprintf('datasets/sim%s_%s_a%s_offs%s_seed9128',
                     ds, nc, aa, offs)
      load(paste0(pref, '_dist.rda'))
      corr <- lapply(1:15, function(i) rowMeans(sapply(res, function(x) x[[i]])))
      corr = do.call(cbind, corr)
      corr = data.frame(corr, stringsAsFactors = F)
      colnames(corr) <- c("true W", paste0("zinb k=", 1:4), "PCA", "PCA TC",
                          "PCA TMM", "PCA DESeq2", "PCA FQ", 'ZIFA', 'ZIFA TC', 
                          'ZIFA TMM', 'ZIFA DESeq2', 'ZIFA FQ')
      main = paste0("Correlation between true and estimated sample distances in W\n", nc, " cells, offset ", offs, ", clustering ", aa)
      boxplot(corr, main = main, border = c(3, 1, 2, rep(1,12)), xlab = "",
              ylab = "Correlation", las = 2)
      abline(h = 1, lty = 2)
      abline(v = 5.5, lty = 2)
      abline(v = 1.5, lty = 2)
      abline(v = 10.5, lty = 2)
    }
  }
}

corr = lapply(c(100, 1000), function(nc){
  lapply(c(1, .85), function(aa){
    lapply(c(-3.5, 0, 3.5), function(offs){
      pp = sprintf('datasets/simAllen_%s_a%s_offs%s_seed9128', nc, aa, offs)
      load(paste0(pp, '_dist.rda'))
      cc <- lapply(1:15, function(i) rowMeans(sapply(res, function(x) x[[i]])))
      do.call(cbind, cc)
    })
  })
})
corr = do.call(rbind, do.call(rbind, do.call(rbind, corr)))
corr = data.frame(corr, stringsAsFactors = F)
colnames(corr) <- c("true W", paste0("zinb k=", 1:4), "PCA", "PCA TC",
                          "PCA TMM", "PCA DESeq2", "PCA FQ", 'ZIFA', 'ZIFA TC', 
                          'ZIFA TMM', 'ZIFA DESeq2', 'ZIFA FQ')

corr$pzero = c(rep(rep(c(.25,.45,.75), each = 100), 2),
               rep(rep(c(.25,.45,.75), each = 1000), 2))
corr$var = c(rep(paste('Clustering', 1:2), each = 300),
             rep(paste('Clustering', 1:2), each = 3000))
corr$nc = c(rep('100', 600), rep('1000', 6000))
corrMolten = melt(corr, id.vars = c('pzero', 'var', 'nc'))
corrSum = corrMolten %>% group_by(pzero, var, variable, nc) %>%
  summarize(mean = mean(value), sd = sd(value)) %>% ungroup() %>%
  as.data.frame()
corrSum$pzero = factor(corrSum$pzero)

c1 = corrSum[corrSum$variable != 'true W',]
c1$method = sapply(strsplit(as.vector(c1$variable), ' '), '[[', 1)
allen = ggplot(c1, aes(x = pzero, y = mean, col = variable, group = variable)) +
  geom_point() + geom_line() + labs(col='') + 
  theme_bw() + xlab('Zero Fraction') + facet_grid(nc ~ var) +
  ylab('Correlation between true and estimated sample distances in W')
allen

#ggsave(filename="paper/6680489mtyrjx/allen_correlations.pdf", plot = allen,
#       width = 5, height = 5)

Silhouette width

for (nc in c(100, 1000)){
  for (offs in c(-3.5, 0, 3.5)){
    for (aa in c(1, .85)){
      pref = sprintf('datasets/sim%s_%s_a%s_offs%s_seed9128',
                     ds, nc, aa, offs)
      load(paste0(pref, '_dist.rda'))
      sil <- lapply(16:length(res[[1]]), function(i){
        rowMeans(sapply(res, function(x) x[[i]]))
      })
      sil = do.call(cbind, sil)
      sil = data.frame(sil,stringsAsFactors = F)
      colnames(sil) <- c("true W", paste0("zinb k=", 1:4), "PCA", "PCA TC",
                          "PCA TMM", "PCA DESeq2", "PCA FQ", 'ZIFA', 'ZIFA TC', 
                          'ZIFA TMM', 'ZIFA DESeq2', 'ZIFA FQ')
      main = paste0("Silhouette width of true labels\n", nc, " cells, offset ", offs, ", clustering ", aa)
      boxplot(sil, main = main, border=c(3, 1, 2, rep(1,10)), xlab="", 
              ylab="Silhouette width", las=2)
      boxplot(sil - sil[, 1], main = paste0("Difference of ", main),
              border = c(3, 1, 2, rep(1,12)), xlab = "", las = 2,
              ylab = "Silhouette width - True Silhouette width")
      abline(h = 0, lty = 2)
    }
  }
}

sil = lapply(c(100, 1000), function(nc){
  lapply(c(1, .85), function(aa){
    lapply(c(-3.5, 0, 3.5), function(offs){
      pp = sprintf('datasets/simAllen_%s_a%s_offs%s_seed9128', nc, aa, offs)
      load(paste0(pp, '_dist.rda'))
      ss <- lapply(16:length(res[[1]]), function(i){
        rowMeans(sapply(res, function(x) x[[i]]))
      })
      do.call(cbind, ss)
    })
  })
})
sil = do.call(rbind, do.call(rbind, do.call(rbind, sil)))
sil = data.frame(sil, stringsAsFactors = F)
colnames(sil) <- c("true W", paste0("zinb k=", 1:4), "PCA", "PCA TC",
                          "PCA TMM", "PCA DESeq2", "PCA FQ", 'ZIFA', 'ZIFA TC', 
                          'ZIFA TMM', 'ZIFA DESeq2', 'ZIFA FQ')
sil = sil - sil[,1]
sil$pzero = c(rep(rep(c(.25,.45,.75), each = 100), 2),
               rep(rep(c(.25,.45,.75), each = 1000), 2))
sil$var = c(rep(paste('Clustering', 1:2), each = 300),
             rep(paste('Clustering', 1:2), each = 3000))
sil$nc = c(rep('100', 600), rep('1000', 6000))
silMolten = melt(sil, id.vars = c('pzero', 'var', 'nc'))
silSum = silMolten %>% group_by(pzero, var, variable, nc) %>%
  summarize(mean = mean(value), sd = sd(value)) %>% ungroup() %>%
  as.data.frame()
silSum$pzero = factor(silSum$pzero)

c1 = silSum[silSum$variable != 'true W',]
c1$method = sapply(strsplit(as.vector(c1$variable), ' '), '[[', 1)
allen = ggplot(c1, aes(x = pzero, y = mean, col = variable, group = variable)) +
  geom_point() + geom_line() + labs(col='') + 
  theme_bw() + xlab('Zero Fraction') + facet_grid(nc ~ var) +
  ylab('Silhouette width - True Silhouette width') + 
  geom_hline(yintercept = 0, col = 'gray')
allen

#ggsave(filename="paper/6680489mtyrjx/allen_correlations.pdf", plot = allen,
#       width = 5, height = 5)

For one simulated dataset with the right parameters (Vintercept, genewise dispersion, K= 2).

par(mfrow = c(3,4))
pref = "datasets/simAllen_100_a1_offs-3.5_seed9128"
load(paste0(pref, ".rda"))
load(paste0(pref, "_zifaFQ.rda"))
load(paste0(pref, "_fitted.rda"))
biotrue = factor(bio)
fit = fittedSim[[2]][[1]]
counts = t(simData[[1]]$counts)
counts = counts[rowSums(counts) != 0, ]
fq <- betweenLaneNormalization(counts, which="full")
pca <- prcomp(t(log1p(fq)))
par(mfrow=c(3,4))
plot(simModel@W, col = biotrue, main = 'True W\nfZero = 0.25', 
     xlab = 'W1', ylab = 'W2')
plot(fit@W, col = biotrue, main = 'estimated ZINB W\nfZero = 0.25', 
     xlab = 'W1', ylab = 'W2')
plot(pca$x, col = biotrue, main = 'PCA FQ\nfZero = 0.25')
plot(zifaFQ[[1]], col = biotrue, main = 'ZIFA FQ\nfZero = 0.25')

pref = "datasets/simAllen_100_a1_offs0_seed9128"
load(paste0(pref, ".rda"))
load(paste0(pref, "_zifaTC.rda"))
load(paste0(pref, "_fitted.rda"))
biotrue = factor(bio)
fit = fittedSim[[2]][[1]]
counts = t(simData[[1]]$counts)
counts = counts[rowSums(counts) != 0, ]
mult = sum(counts) / (ncol(counts) * nrow(counts))
fact = colSums(counts)
tc = mult * (t(counts) / fact)
pca <- prcomp(log1p(tc))
plot(simModel@W, col = biotrue, main = 'True W\nfZero = 0.45', 
     xlab = 'W1', ylab = 'W2')
plot(fit@W, col = biotrue, main = 'estimated ZINB W\nfZero = 0.45', 
     xlab = 'W1', ylab = 'W2')
plot(pca$x, col = biotrue, main = 'PCA TC\nfZero = 0.45')
plot(zifaTC[[1]], col = biotrue, main = 'ZIFA TC\nfZero = 0.45')

pref = "datasets/simAllen_100_a1_offs3.5_seed9128"
load(paste0(pref, ".rda"))
load(paste0(pref, "_zifaTC.rda"))
load(paste0(pref, "_fitted.rda"))
biotrue = factor(bio)
fit = fittedSim[[2]][[1]]
counts = t(simData[[1]]$counts)
counts = counts[rowSums(counts) != 0, ]
mult = sum(counts) / (ncol(counts) * nrow(counts))
fact = colSums(counts)
tc = mult * (t(counts) / fact)
pca <- prcomp(log1p(tc))
plot(simModel@W, col = biotrue, main = 'True W\nfZero = 0.75', 
     xlab = 'W1', ylab = 'W2')
plot(fit@W, col = biotrue, main = 'estimated ZINB W\nfZero = 0.75', 
     xlab = 'W1', ylab = 'W2')
plot(pca$x, col = biotrue, main = 'PCA TC\nfZero = 0.75')
plot(zifaTC[[1]], col = biotrue, main = 'ZIFA TC\nfZero = 0.75')

Model fit

Let’s look at model fit when chosen parameters are correct (V, genewise dispersion, K=2).

From Sandrine’s slide.

Mean, variance, and zero probability for the ZINB model are \[E[Y_{i,j}] = (1 - \pi_{i,j})\mu_{i,j},\] \[var(Y_{i,j}) = (1 - \pi_{i,j})\mu_{i,j}(1 + \mu_{i,j}(\phi_j + \pi_{i,j})),\] \[P(Y_{i,j} = 0) = \pi_{i,j} + (1 - \pi_{i,j})(1 + \phi_j \mu_{i,j})^{\frac{1}{\phi_j}}.\]

For the NB model, \(\pi_{i,j}=0\). We fitted the NB using edgeR after full quantile normalization.

Zero Fraction 45%

computeExp <- function(zinbModel){
    (1 - t(getPi(zinbModel))) * t(getMu(zinbModel))
}

computeVar <- function(zinbModel){
  mu = t(getMu(zinbModel))
  pi = t(getPi(zinbModel))
  phi = exp(-getZeta(zinbModel))
  (1 - pi) * mu * (1 + mu*(phi + pi))
}

computeP0 <- function(zinbModel){
  mu = t(getMu(zinbModel))
  pi = t(getPi(zinbModel))
  phi = exp(-getZeta(zinbModel))
  pi + (1 - pi) * (1 + phi * mu) ^ (-1/phi)
}

plotMD <- function(x, y, xlim = c(0,10), ylim = c(-5, 5),
                   main = 'ZINB: MD-plot estimated vs. observed mean count, log scale'){
  mm = .5*(x + y)
  dd = x - y
  smoothScatter(mm, dd, xlim = xlim, ylim = ylim, xlab = 'Mean', ylab = 'Diff', main = main)
  abline(h = 0, col = 'gray')
  fit = loess(dd ~ mm)
  xpred = seq(0, 10, .1)
  pred = predict(fit, xpred)
  lines(xpred, pred, col = 'red', type = 'l', lwd=2)
}

fitNB <- function(rda, i = 1){
  load(rda)
  counts = t(simData[[i]]$counts)
  phenoData = data.frame(bio)
  set = newSeqExpressionSet(counts, phenoData = phenoData)
  fq = betweenLaneNormalization(set, which = "full", offset = T)
  disp = estimateDisp(counts(fq), offset = -offst(fq)) 
  fit = glmFit(counts(fq), dispersion = disp$tagwise.dispersion, offset = -offst(fq))
  list(fitted = fit$fitted.values, disp = disp$tagwise.dispersion)
}

There is one outlier for zinb fit.

pp = 'datasets/simAllen_1000_a1_offs0_seed9128'
# zinb
load(paste0(pp, '_fitted.rda'))
load(paste0(pp, '.rda'))
zz = fittedSim[[2]][[1]]
zinbEY = rowMeans(log1p(computeExp(zz)))
zinbPY0 = rowMeans(computeP0(zz))
# observed
counts = t(simData[[1]]$counts)
logAveCount = rowMeans(log1p(counts))
prop0 = rowMeans(counts == 0)
# edgeR
nb25 = fitNB(paste0(pp, '.rda'))
## Design matrix not provided. Switch to the classic mode.
nbEY = rowMeans(log1p(nb25$fitted))
nbPY0 = rowMeans((1 + nb25$fitted * nb25$disp)^(-1/nb25$disp))

MD-plot estimated vs. observed mean count, log scale, zoomed (we do not see the outlier)

par(mfrow=c(1,2))
plotMD(logAveCount, zinbEY, xlim = c(0, 7), ylim = c(-6, 6),
       main = 'ZINB')
plotMD(logAveCount, nbEY, xlim = c(0, 7), ylim = c(-6, 6),
       main = 'NB, edgeR')

MD-plot estimated vs. observed zero probability

par(mfrow=c(1,2))
plotMD(prop0, zinbPY0, xlim = c(0, 1), ylim = c(-1, 1),
       main = 'ZINB')
plotMD(prop0, nbPY0, xlim = c(0, 1), ylim = c(-1, 1), 
       main = 'NB, edgeR')

par(mfrow=c(1,1))
smoothScatter(logAveCount, rowMeans(counts == 0),
              xlab = 'log average count', xlim = c(0,7),
              ylab = 'proportion of zeros', ylim = c(0,1),
              main = 'Zero probability versus Mean')
fit = loess(zinbPY0 ~ zinbEY)
xpred = seq(0,9,.1)
pred = predict(fit, xpred)
lines(xpred, pred,col='green',type = 'l', lwd=2, ylim = c(0,1), xlim = c(0,7))
fit = loess(nbPY0 ~ nbEY)
xpred = seq(0,9,.1)
pred = predict(fit, xpred)
lines(xpred, pred,col='red', type='l', lwd=2, ylim = c(0,1), xlim = c(0,7))
fit = loess(rowMeans(counts == 0) ~ logAveCount)
xpred = seq(0,9,.1)
pred = predict(fit, xpred)
lines(xpred, pred,col = 'blue',type = 'l',lwd=2, ylim = c(0,1), xlim = c(0,7))
legend('topright', c('observed', 'zinb', 'nb, edgeR'), 
   lty=1, col=c('blue', 'green', 'red'), bty='n', cex=.75)

par(mfrow = c(1, 2))
smoothScatter(exp(-simModel@zeta),exp(-zz@zeta), 
              xlab = 'True Dispersion', ylab = 'Estimated Dispersion', 
              ylim = c(0, 10),xlim = c(0,10), 
              main = 'ZINB')
abline(a = 0, b = 1, col = 'gray')
smoothScatter(exp(-simModel@zeta), nb25$disp, 
              xlab = 'True Dispersion', ylab = 'Estimated Dispersion', 
              ylim = c(0, 10),xlim = c(0,10), 
              main = 'NB, edgeR')
abline(a = 0, b = 1, col = 'gray')

par(mfrow = c(1, 1))

xpred = seq(0, 1, .05)
fitzinb = loess(exp(-zz@zeta) ~ rowMeans(counts == 0))
predzinb = predict(fitzinb, xpred)
fitnb = loess(nb25$disp ~ rowMeans(counts == 0))
prednb = predict(fitnb, xpred)
smoothScatter(rowMeans(counts == 0), exp(-simModel@zeta),
              xlab = 'Observed zero probability',ylim = c(0, 15),
              ylab = 'True dispersion',xlim = c(0,1), 
              main = 'True Dispersion versus observed zero probability')
lines(xpred, predzinb, col = 'green', type = 'l', lwd=2)
lines(xpred, prednb, col = 'red', type = 'l', lwd=2)
legend('topright',c('zinb','nb, edgeR'),lty=1,col=c('green','red'), bty='n', cex=.75)